The Washington Post compiled database of every fatal police shooting in the United States by a police officer in the line of duty since January 1, 2015. Click here to view/download the data.
Click on the following tabs to uncover 6 interesting findings
#Check if the libraries are presently installed in the PC. If not then install them
list.of.packages <- c("ggplot2", "plotly", "ggthemes", "tidyverse", "forecast", "plyr")
new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
#Loading the packages
library(plotly)
library(ggplot2)
library(ggthemes)
library(tidyverse)
library(dplyr)
library(forecast)
# Reading in the csv file
police <- read.csv("C://Users//maitr//Desktop//SCU//R//Project 3//database.csv")
#Checking the type of data
head(police)
summary(police)
## id name date
## Min. : 3.0 : 19 03-02-2017: 8
## 1st Qu.: 664.5 Brandon Jones : 2 07-07-2015: 8
## Median :1231.5 Daquan Antonio Westbrook: 2 10-02-2017: 8
## Mean :1232.2 Eric Harris : 2 14-12-2015: 8
## 3rd Qu.:1811.8 Jamake Cason Thomas : 2 21-12-2016: 8
## Max. :2394.0 Michael Johnson : 2 24-01-2017: 8
## (Other) :2113 (Other) :2094
## manner_of_death armed age gender race
## shot :1992 gun :1173 Min. : 6.00 : 1 : 103
## shot and Tasered: 150 knife : 319 1st Qu.:26.00 F: 89 A: 33
## unarmed : 155 Median :34.00 M:2052 B: 542
## vehicle : 136 Mean :36.48 H: 367
## undetermined: 101 3rd Qu.:45.00 N: 28
## toy weapon : 92 Max. :86.00 O: 28
## (Other) : 166 NA's :43 W:1041
## city state signs_of_mental_illness threat_level
## Los Angeles: 31 CA : 355 Mode :logical attack :1381
## Houston : 24 TX : 197 FALSE:1612 other : 629
## Phoenix : 24 FL : 129 TRUE :530 undetermined: 132
## Chicago : 23 AZ : 95
## Austin : 16 NC : 65
## Columbus : 16 OK : 65
## (Other) :2008 (Other):1236
## flee body_camera
## : 36 Mode :logical
## Car : 318 FALSE:1913
## Foot : 254 TRUE :229
## Not fleeing:1453
## Other : 81
##
##
We looked at the total deaths in each state by race and following are some of the insights:
#------------------------------------Interesting Finding 1--------------------------------------#
#Filtering the data by race and states and summarizing it
police_sr <- police %>%
filter(!race == "") %>%
group_by(state, race) %>%
summarise(deaths = n())
#Spreading the data to get deaths by race
spread_sr <- spread(police_sr, race, deaths)
#Getting the total number of deaths per state
spread_sr$Total <- rowSums(spread_sr[,-1], na.rm = TRUE)
#Check the results if needed
# head(spread_sr)
#Data setup for hovering on the map
sr_data <- spread_sr
sr_data$hover <- with(sr_data, paste("Asian", A, '<br>',
"Black",B,'<br>',
"Hispanic",H,'<br>',
"Native American",N,'<br>',
"Other",O,'<br>',
"White",W,'<br>',
"Total Deaths", `Total`))
#Map specifications
graph <- list(
scope = 'usa',
projection = list(type = 'albers usa'),
showlakes = TRUE,
lakecolor = toRGB('white')
)
#Plotting the graph
plot_ly(spread_sr, z = spread_sr$Total, text = sr_data$hover, locations = sr_data$state, type = 'choropleth',
locationmode = 'USA-states', color = spread_sr$Total, colors = 'Reds',
colorbar = list(title = "Counts ")) %>%
layout(title = 'Number of people shot dead by race per State<br>(Hover for breakdown by race)', geo = graph)
We are looking at the age of the suspect shot vs their race. The observations are as follows:
#------------------------------------Interesting Finding 2--------------------------------------#
#Filtering the data by age and race
police %>%
filter(!is.na(age) & race != '') %>% #Remove blanks and NAs
#Box plots
ggplot (aes(x=race, y=age)) +
geom_boxplot(aes(color = race)) +
#Axes labels and titles
labs(x = "Suspects' Race", y = "Suspecs' Age",
title = "Distribution of Suspects' Age across Race") +
scale_x_discrete(labels=c('White',
'Other',
'Native American',
'Hispanic',
'Black',
'Asian')) +
coord_flip() +
theme_bw() +
theme(legend.position = "none")
We looked at the deaths by race and gender and following are some of the insights:
#-----------------------------------Interesting Finding 3---------------------------------------#
#Filtering the data by race and gender
data_by_rg <- police %>%
filter(race != "") %>%
filter(gender != "") %>%
group_by(race, gender) %>%
summarize(No_of_deaths = n())
#Plotting Data
ggplot(data_by_rg, aes(x = race, y = No_of_deaths, fill = gender)) +
geom_bar(stat = "identity") +
labs (x = 'Race', y = 'Number of deaths') +
ggtitle('Deaths by race and gender') +
scale_x_discrete(labels=c('Asian',
'Black',
'Hispanic',
'Native American',
'Other',
'White')) +
theme_few()
We looked at the distribution of deaths by Race and top 5 armed categories. Following are some key observations:
#-----------------------------------Interesting Finding 4---------------------------------------#
# Finding the top 5 arms used by suspects
top_5_arms <- police %>%
group_by(armed) %>%
summarise(num_arms = n()) %>%
arrange(desc(num_arms)) %>%
head(5)
#Filtering the data by top 5 arms found
race_armed_data <- police %>%
filter(race != '') %>%
mutate(armed_mod = ifelse(armed %in% c('gun', 'knife', 'unarmed', 'vehicle', 'undetermined'), as.character(armed), 'Other')) %>%
group_by(race, armed_mod) %>%
summarise(Deaths = n())
# Spreading the data
race_armed_data_spr <- race_armed_data %>%
spread(armed_mod, Deaths)
#Replacing missing values with 0
race_armed_data_spr[is.na(race_armed_data_spr)] <- 0
print("% distribution of deaths by Armed Category in each Race")
## [1] "% distribution of deaths by Armed Category in each Race"
#Printing the data
summary_table <- cbind(as.data.frame(race_armed_data_spr[,1]), as.data.frame(round(race_armed_data_spr[,-1]/rowSums(race_armed_data_spr[,-1])*100, 2)))
#Renaming the race values
levels(summary_table$race) <- c("", "Asian", "Black", "Hispanic", "Native American", "Other", "White")
#Print the table
print(as.data.frame(summary_table))
## race gun knife Other unarmed undetermined vehicle
## 1 Asian 27.27 42.42 18.18 0.00 3.03 9.09
## 2 Black 57.93 10.52 9.59 10.89 4.24 6.83
## 3 Hispanic 49.32 16.35 12.53 7.90 7.63 6.27
## 4 Native American 60.71 17.86 7.14 7.14 0.00 7.14
## 5 Other 39.29 32.14 3.57 14.29 0.00 10.71
## 6 White 56.29 14.89 12.87 5.76 4.13 6.05
#Plotting the above table in a stacked bar
ggplot(race_armed_data, aes(x = race, y = Deaths, fill = armed_mod)) +
geom_bar(stat = "identity") +
labs (x = 'Race', y = 'Number of deaths') +
ggtitle('How were suspects/victims armed by Race') +
scale_x_discrete(labels=c('Asian',
'Black',
'Hispanic',
'Native American',
'Other',
'White')) +
theme_few()
We looked at the distribution of deaths by suspects’ race and whether they were trying to flee or not. Following are some of the interesting observations:
#-----------------------------Interesting Finding 5---------------------------------------------#
#Filtering the data and summarizing it
race_flee_data <- police %>%
filter(race != '') %>%
group_by(race, flee) %>%
summarise(Deaths = n()) %>%
spread(flee, Deaths)
#Replacing the missing values with 0
race_flee_data[is.na(race_flee_data)] <- 0
print("% distribution of deaths by suspects' status (Fleeing or not fleeing) by Race")
## [1] "% distribution of deaths by suspects' status (Fleeing or not fleeing) by Race"
#Printing the table
summary_table <- cbind(as.data.frame(race_flee_data[,1]), as.data.frame(round(race_flee_data[,-1]/rowSums(race_flee_data[,-1])*100, 2)))
#Renaming the race values
levels(summary_table$race) <- c("", "Asian", "Black", "Hispanic", "Native American", "Other", "White")
#Print the table
print(as.data.frame(summary_table))
## race V1 Car Foot Not fleeing Other
## 1 Asian 0.00 9.09 9.09 81.82 0.00
## 2 Black 1.48 15.13 19.00 60.70 3.69
## 3 Hispanic 1.63 16.89 11.72 64.85 4.90
## 4 Native American 7.14 7.14 17.86 67.86 0.00
## 5 Other 0.00 14.29 7.14 75.00 3.57
## 6 White 1.15 14.89 8.84 71.37 3.75
We looked into the monthly trend for two years and used ARIMA to forecast the crime for next four months. Since, there is not much seasonality into the police shootings, even the forecast predicts average shootings for the next four months with a very wide confidence interval.
#--------------------------Interesting Pattern 6-------------------------------------#
#Summarizing the data at year month level and plotting the trend line
police %>%
mutate(year_month = format(as.Date(date, '%d-%m-%Y'),"%Y_%m")) %>%
group_by(year_month) %>%
summarise(n = n()) %>%
#Plot the line
ggplot(aes(x = year_month, y = n, group = 1)) +
geom_line(color = "blue") +
geom_point(color = "blue") +
#Axes labels and titles
labs(x = "Year and Month of death occurrence",
y = "Number of deaths",
title = "Number of deaths Vs. Year and Month of death occurrence") +
scale_x_discrete(breaks =
levels(as.factor(format(as.Date(police$date, '%d-%m-%Y'),"%Y_%m")))[c(T, rep(F, 11))]) +
theme_bw()
Forecasting the next four months deaths using the previous months data by using ARIMA
#------------------------------Forecasting crime using ARIMA----------------------------------#
#Number of deaths by year and month
number_of_crimes <- police %>%
mutate(year_month = format(as.Date(date, '%d-%m-%Y'),"%Y_%m")) %>%
group_by(year_month) %>%
summarise(n = n()) %>%
select(n)
#Converting number_of_crimes in to time series object
crime_ts <- ts(number_of_crimes)
#Fitting the best arima model
model <- auto.arima(crime_ts, stepwise = FALSE, approximation = FALSE)
#Predict next 4 months based on the model created
predict <- model %>% forecast(level = c(95), h = 5)
#Plot the graph
print(predict %>%
autoplot() +
labs(x = "Year and Month from Jan 2015 to Feb 2017",
y = "Number of deaths",
title = paste0("Death due to police shooting forecast for the next four months")) +
theme_bw())